Take-home_Ex3

Author

Han Shumin

Published

June 2, 2023

Modified

June 14, 2023

Overview

The task

With reference to Mini-Challenge 3 of VAST Challenge 2023 and by using appropriate static and interactive statistical graphics methods, I will be revealing the following findings:

Methodology

Data Preparation

Install and load the packages

The following code chunks will install and load the required packages.

Show the code
# Install (if needed) and load every package used in this analysis.
pacman::p_load(
  jsonlite, tidygraph, ggraph, visNetwork,
  graphlayouts, ggforce, skimr, tidytext,
  tidyverse, igraph, wordcloud, RColorBrewer,
  stringr, cluster
)

Load the dataset in JSON format

In the code chunk below, fromJSON() of the jsonlite package is used to import MC3.json into the R environment.

Show the code
# Import the MC3 knowledge graph (a nested list of nodes and links) from JSON.
MC3 <- fromJSON("data/MC3.json")

Data Wrangling

Initial Data Exploration

Exploring the edges data frame

In the code chunk below, skim() of the skimr package is used to display the summary statistics of the MC3_edges tibble data frame.

Show the code
# Summary statistics (missingness, uniqueness, distributions) for the edges table.
skim(MC3_edges)
Data summary
Name MC3_edges
Number of rows 24036
Number of columns 4
_______________________
Column type frequency:
character 3
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
source 0 1 6 700 0 12856 0
target 0 1 6 28 0 21265 0
type 0 1 16 16 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
weights 0 1 1 0 1 1 1 1 1 ▁▁▇▁▁

The report above reveals that there are no missing values in any of the fields.

In the code chunk below, datatable() of DT package is used to display MC3_edges tibble dataframe as an interactive table on the html document.

Show the code
# Browse MC3_edges as an interactive table in the rendered HTML document.
DT::datatable(MC3_edges)
Show the code
# Bar chart of edge counts by relationship type.
ggplot(MC3_edges, aes(x = type)) +
  geom_bar()

Initial Network Visualisation and Analysis

Building network model with tidygraph

Show the code
# Build the node list from the edge endpoints: every distinct source or
# target id becomes a node, with attributes attached from MC3_nodes.

# Source endpoints, renamed to the common key `id`.
id1 <- MC3_edges %>%
  select(source) %>%
  rename(id = source)

# Target endpoints, renamed in a single select().
id2 <- MC3_edges %>%
  select(id = target)

# Stack the two id lists, de-duplicate, then attach node attributes.
# `by = "id"` makes the join key explicit (avoids the natural-join
# "Joining with ..." message); `unmatched = "drop"` keeps the original
# silent-drop behaviour for ids with no attribute record.
MC3_nodes1 <- rbind(id1, id2) %>%
  distinct() %>%
  left_join(MC3_nodes, by = "id", unmatched = "drop")
Show the code
# Construct an undirected tidygraph object, compute three centrality
# measures on the full graph, then keep only high-betweenness nodes
# (bridges) that also have degree >= 3.
MC3_graph <- tbl_graph(
  nodes = MC3_nodes1,
  edges = MC3_edges,
  directed = FALSE
) %>%
  mutate(
    betweenness_centrality = centrality_betweenness(),
    closeness_centrality = as.factor(centrality_closeness()),
    degree_centrality = centrality_degree()
  ) %>%
  filter(betweenness_centrality >= 30000,
         degree_centrality >= 3)
Show the code
# Calculate the degree of each node in the filtered graph
# (used below to decide which labels to draw).
degrees <- degree(MC3_graph)

set.seed(1234)
MC3_graph %>%
  ggraph(layout = "fr") +
  # Fix: constant alpha belongs OUTSIDE aes(). Mapping a constant inside
  # aes() runs it through an alpha scale, so 0.5 was not actually applied.
  geom_edge_link(alpha = 0.5) +
  geom_node_point(aes(size = betweenness_centrality,
                      color = closeness_centrality),
                  alpha = 0.5, show.legend = FALSE) +
  # Label only well-connected nodes (degree > 3) to reduce clutter.
  geom_node_text(aes(label = ifelse(degrees > 3, as.character(id), "")),
                 size = 2) +
  scale_size_continuous(range = c(1, 10)) +
  theme_graph()

Show the code
# Rebuild the graph keeping only high-betweenness nodes, then aggregate
# further by eigenvector centrality to surface the most influential nodes.
MC3_graph <- tbl_graph(nodes = MC3_nodes1,
                       edges = MC3_edges,
                       directed = FALSE) %>%
  mutate(betweenness_centrality = centrality_betweenness()) %>%
  filter(betweenness_centrality >= 10000)

# Deciles of eigenvector centrality across the remaining nodes.
quantile_graph <- quantile(eigen_centrality(MC3_graph)$vector,
                           probs = seq(0, 1, 1 / 10))

# Store eigenvector centrality as a node attribute named `size`.
V(MC3_graph)$size <- eigen_centrality(MC3_graph)$vector

# Drop everything below the 90th percentile (keep the top decile).
MC3_graph_aggregated <- delete_vertices(
  MC3_graph,
  V(MC3_graph)[size < quantile_graph[10]]
)

# Precompute a deterministic Fruchterman-Reingold layout.
set.seed(1234)
layout1 <- layout_with_fr(MC3_graph_aggregated)

# Deciles of centrality among the surviving vertices; quantile[10] is the
# 90th percentile, so the highlighted nodes are the top 10% (the original
# comment said 20%, which did not match the code).
quantile_graph_aggregated <- quantile(V(MC3_graph_aggregated)$size,
                                      probs = seq(0, 1, 1 / 10))

# Colour the top-decile vertices gold; everything else stays muted.
V(MC3_graph_aggregated)$color <- ifelse(
  V(MC3_graph_aggregated)$size > quantile_graph_aggregated[10],
  "darkgoldenrod3", "azure3"
)
E(MC3_graph_aggregated)$color <- "grey"

# Scale node size so only high-centrality nodes are clearly visible.
V(MC3_graph_aggregated)$size <- V(MC3_graph_aggregated)$size / 0.065

# Label only top-decile vertices (undo the size scaling before
# comparing against the unscaled quantile threshold).
V(MC3_graph_aggregated)$id <- ifelse(
  V(MC3_graph_aggregated)$size * 0.065 > quantile_graph_aggregated[10],
  V(MC3_graph_aggregated)$id, NA
)

# Bug fix: pass the precomputed seeded layout to plot(). Previously
# `layout1` was computed but never used, so plot() generated its own
# layout and the set.seed() call had no effect on the drawing.
plot(MC3_graph_aggregated,
     layout = layout1,
     edge.arrow.size = 0.25, edge.arrow.mode = "-",
     vertex.label = V(MC3_graph_aggregated)$id, vertex.label.cex = 0.65,
     vertex.label.font = 1,
     main = "Which companies are having more edges to other nodes?")

Exploring the nodes data frame

In the code chunk below, skim() of skimr package is used to display the summary statistics of MC3_nodes tibble data frame.

Show the code
# Summary statistics (missingness, uniqueness, distributions) for the nodes table.
skim(MC3_nodes)
Data summary
Name MC3_nodes
Number of rows 27622
Number of columns 5
_______________________
Column type frequency:
character 4
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
id 0 1 6 64 0 22929 0
country 0 1 2 15 0 100 0
type 0 1 7 16 0 3 0
product_services 0 1 4 1737 0 3244 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
revenue_omu 21515 0.22 1822155 18184433 3652.23 7676.36 16210.68 48327.66 310612303 ▇▁▁▁▁

The report above reveals that the character fields have no missing values, but revenue_omu is missing in 21,515 rows (a complete rate of only 0.22).

In the code chunk below, datatable() of the DT package is used to display the MC3_nodes tibble data frame as an interactive table on the html document.

Show the code
# Browse MC3_nodes as an interactive table in the rendered HTML document.
DT::datatable(MC3_nodes)
Show the code
# Bar chart of node counts by entity type.
ggplot(MC3_nodes, aes(x = type)) +
  geom_bar()

Text Sensing with tidytext

In this section, basic text sensing is performed using appropriate functions of the tidytext package.

Simple word count

The code chunk below calculates number of times the word fish appeared in the field product_services.

Show the code
# Count occurrences of the substring "fish" in product_services per node.
MC3_nodes %>%
  mutate(n_fish = str_count(product_services, pattern = "fish"))
# A tibble: 27,622 × 6
   id                          country type  revenue_omu product_services n_fish
   <chr>                       <chr>   <chr>       <dbl> <chr>             <int>
 1 Jones LLC                   ZH      Comp…  310612303. Automobiles           0
 2 Coleman, Hall and Lopez     ZH      Comp…  162734684. Passenger cars,…      0
 3 Aqua Advancements Sashimi … Oceanus Comp…  115004667. Holding firm wh…      0
 4 Makumba Ltd. Liability Co   Utopor… Comp…   90986413. Car service, ca…      0
 5 Taylor, Taylor and Farrell  ZH      Comp…   81466667. Fully electric …      0
 6 Harmon, Edwards and Bates   ZH      Comp…   75070435. Discount superm…      0
 7 Punjab s Marine conservati… Riodel… Comp…   72167572. Beef, pork, chi…      0
 8 Assam   Limited Liability … Utopor… Comp…   72162317. Power and Gas s…      0
 9 Ianira Starfish Sagl Import Rio Is… Comp…   68832979. Light commercia…      0
10 Moran, Lewis and Jimenez    ZH      Comp…   65592906. Automobiles, tr…      0
# ℹ 27,612 more rows
Show the code
# Count occurrences of the substring "seafood" in product_services per node.
MC3_nodes %>%
  mutate(n_seafood = str_count(product_services, pattern = "seafood"))
# A tibble: 27,622 × 6
   id                       country type  revenue_omu product_services n_seafood
   <chr>                    <chr>   <chr>       <dbl> <chr>                <int>
 1 Jones LLC                ZH      Comp…  310612303. Automobiles              0
 2 Coleman, Hall and Lopez  ZH      Comp…  162734684. Passenger cars,…         0
 3 Aqua Advancements Sashi… Oceanus Comp…  115004667. Holding firm wh…         0
 4 Makumba Ltd. Liability … Utopor… Comp…   90986413. Car service, ca…         0
 5 Taylor, Taylor and Farr… ZH      Comp…   81466667. Fully electric …         0
 6 Harmon, Edwards and Bat… ZH      Comp…   75070435. Discount superm…         0
 7 Punjab s Marine conserv… Riodel… Comp…   72167572. Beef, pork, chi…         0
 8 Assam   Limited Liabili… Utopor… Comp…   72162317. Power and Gas s…         0
 9 Ianira Starfish Sagl Im… Rio Is… Comp…   68832979. Light commercia…         0
10 Moran, Lewis and Jimenez ZH      Comp…   65592906. Automobiles, tr…         0
# ℹ 27,612 more rows

Tokenisation

The word tokenisation has different meanings in different scientific domains. In text sensing, tokenisation is the process of breaking up a given text into units called tokens. Tokens can be individual words, phrases or even whole sentences. In the process of tokenisation, some characters like punctuation marks may be discarded. The tokens usually become the input for processes like parsing and text mining.

In the code chunk below, unnest_token() of tidytext is used to split text in product_services field into words.

Show the code
# Tokenise product_services into one row per (node, word); by default the
# tokens are lower-cased and punctuation is stripped.
token_nodes <- MC3_nodes %>%
  unnest_tokens(word, product_services)

The two basic arguments to unnest_tokens() used here are column names. First we have the output column name that will be created as the text is unnested into it (word, in this case), and then the input column that the text comes from (product_services, in this case).

Note
  • By default, punctuation has been stripped. (Use the strip_punct = FALSE argument to turn off this behavior for word tokenisation).

  • By default, unnest_tokens() converts the tokens to lowercase, which makes them easier to compare or combine with other datasets. (Use the to_lower = FALSE argument to turn off this behavior).

Now we can visualise the words extracted by using the code chunk below.

Show the code
# Horizontal bar chart of the 15 most frequent tokens in product_services.
token_nodes %>%
  count(word, sort = TRUE) %>%
  # slice_max() replaces the superseded top_n(); ties are kept by both.
  slice_max(n, n = 15) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  # Bug fix: the axis labels were swapped. `x` is the word axis and `y`
  # is the count axis; coord_flip() only changes where they are drawn.
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in product_services field")

The bar chart reveals that the unique words contain some words that may not be useful, for instance "a" and "to". In the world of text mining we call those words stop words. We want to remove these words from the analysis as they are fillers used to compose a sentence.

Removing stopwords

The tidytext package includes a dataset called stop_words that will help us clean up stop words.

Show the code
# Remove common English stop words ("a", "to", ...) from the token table.
# `by = "word"` makes the join key explicit and silences the
# "Joining with ..." message; stop_words also has a `lexicon` column
# that must not participate in the join.
stopwords_removed <- token_nodes %>%
  anti_join(stop_words, by = "word")
Note
  • Load the stop_words data included with tidytext. This data is simply a list of words that you may want to remove in a natural language analysis.

  • Then anti_join() of dplyr package is used to remove all stop words from the analysis.

We can visualise the words extracted again.

Show the code
# Horizontal bar chart of the 20 most frequent tokens after stop-word removal.
stopwords_removed %>%
  count(word, sort = TRUE) %>%
  # slice_max() replaces the superseded top_n(); ties are kept by both.
  slice_max(n, n = 20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  # Bug fix: the axis labels were swapped. `x` is the word axis and `y`
  # is the count axis; coord_flip() only changes where they are drawn.
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in product_services field (removed stopwords)")

Show the code
# Quick structural overview (column types and sample values) of the cleaned tokens.
glimpse(stopwords_removed)
Rows: 88,505
Columns: 5
$ id          <chr> "Jones LLC", "Coleman, Hall and Lopez", "Coleman, Hall and…
$ country     <chr> "ZH", "ZH", "ZH", "ZH", "ZH", "ZH", "Oceanus", "Oceanus", …
$ type        <chr> "Company", "Company", "Company", "Company", "Company", "Co…
$ revenue_omu <dbl> 310612303, 162734684, 162734684, 162734684, 162734684, 162…
$ word        <chr> "automobiles", "passenger", "cars", "trucks", "vans", "bus…
Show the code
# Prepare visNetwork inputs from the filtered tidygraph object.

# Edge list (references nodes by row position).
edges_df <- MC3_graph %>%
  activate(edges) %>%
  as_tibble()            # as.tibble() is deprecated; use as_tibble()

# Node table: visNetwork needs an `id` (row number) and a `label` column.
nodes_df <- MC3_graph %>%
  activate(nodes) %>%
  as_tibble() %>%
  rename(label = id) %>%
  mutate(id = row_number()) %>%
  select(id, label)

# Convert the graph to undirected. The graph was already built with
# directed = FALSE, but Louvain requires an undirected graph, so be explicit.
MC3_graph_undirected <- as.undirected(MC3_graph)

# Perform community detection using the Louvain algorithm on the
# undirected graph. Bug fix: the undirected copy was previously created
# but never used — cluster_louvain() was called on MC3_graph instead.
communities <- cluster_louvain(MC3_graph_undirected)

# Get the cluster membership of each node.
membership <- membership(communities)

# Add the cluster membership information to the nodes data frame.
nodes_df$group <- membership

# Interactive network with community colouring and neighbour highlighting.
visNetwork(nodes_df, edges_df) %>%
  visIgraphLayout(layout = "layout_with_fr") %>%
  visEdges(arrows = "to",
           smooth = list(enabled = TRUE, type = "curvedCW"),
           color = list(highlight = "lightgray")) %>%
  visOptions(highlightNearest = list(enabled = TRUE, degree = 1,
                                     hover = TRUE, labelOnly = TRUE),
             nodesIdSelection = TRUE) %>%
  visLayout(randomSeed = 1234)
Show the code
# Attach the cleaned tokens to the network nodes by company label.
df_temp <- stopwords_removed %>%
  select(label = id, word)

# merge() keeps only labels present in both tables (inner join by default).
merged_df <- merge(nodes_df, df_temp, by = "label")

glimpse(merged_df)
Rows: 5,132
Columns: 4
$ label <chr> "4 N.V. Marine biology", "7 GmbH & Co. KG", "7 GmbH & Co. KG", "…
$ id    <int> 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2…
$ group <dbl> 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2…
$ word  <chr> "unknown", "manufacture", "flatware", "pans", "plated", "kitchen…
Show the code
# Most frequent token within each detected community.
top_words <- merged_df %>%
  count(group, word, sort = TRUE) %>%
  group_by(group) %>%
  # slice_max() replaces the superseded top_n(); ties are kept by both.
  slice_max(n, n = 1) %>%
  ungroup()  # drop the grouping so downstream code gets a plain tibble

# Stacked bars: one bar per community, coloured by its top word(s).
ggplot(top_words, aes(x = group, y = n, fill = word)) +
  geom_col() +
  labs(x = "Group", y = "Frequency", fill = "Word") +
  theme_minimal()

Show the code
# Word cloud of token frequencies from the stop-word-cleaned tokens.
df_wordcloud <- stopwords_removed

# Tabulate word frequencies, dropping uninformative placeholder tokens;
# count(sort = TRUE) yields the same descending-frequency ordering as
# group_by/summarise/arrange.
word_frequency <- df_wordcloud %>%
  filter(!word %in% c("character", "0", "unknown")) %>%
  count(word, name = "freq", sort = TRUE)

# Render the word cloud; the seed fixes the random colour assignment.
set.seed(1234)
wordcloud(words = word_frequency$word,
          freq = word_frequency$freq,
          min.freq = 10,
          max.words = 200,
          random.order = FALSE,
          rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))

Visualize different keyword

Show the code
# Extract companies whose tokens mention "products" and visualise their
# network neighbourhood with community colouring.
df <- stopwords_removed

# Keep only rows whose token matches "products".
df_extracted <- df %>%
  filter(str_detect(word, pattern = "products"))

# Bug fix: the argument is `.keep_all`, not `keep_all`. The original call
# created a literal column named `keep_all` (value TRUE) instead of
# retaining the other columns of the first row per id.
df_extracted_distinct <- df_extracted %>%
  distinct(id, .keep_all = TRUE)

# Network visualisation for products-related companies.

id3 <- MC3_edges %>%
  select(source) %>%
  rename(id = source)

id4 <- MC3_edges %>%
  select(id = target)

# Distinct node list with attributes; `by = "id"` makes the join key explicit.
MC3_nodes_extracted <- rbind(id3, id4) %>%
  distinct() %>%
  left_join(df_extracted_distinct, by = "id", unmatched = "drop")

# Keep only the top 1% of nodes by betweenness centrality.
MC3_graph_extracted <- tbl_graph(nodes = MC3_nodes_extracted,
                                 edges = MC3_edges,
                                 directed = FALSE) %>%
  mutate(betweenness_centrality = centrality_betweenness()) %>%
  filter(betweenness_centrality >= quantile(betweenness_centrality, 0.99))

# Create the visNetwork data frames.
edges_df1 <- MC3_graph_extracted %>%
  activate(edges) %>%
  as_tibble()            # as.tibble() is deprecated; use as_tibble()

nodes_df1 <- MC3_graph_extracted %>%
  activate(nodes) %>%
  as_tibble() %>%
  rename(label = id) %>%
  mutate(id = row_number()) %>%
  select(id, label)

# Community detection with the Louvain algorithm; the graph was built
# with directed = FALSE, so no conversion is needed.
communities <- cluster_louvain(MC3_graph_extracted)

# Attach each node's community as the visNetwork `group`.
membership <- membership(communities)
nodes_df1$group <- membership

# Interactive plot with community selection and neighbour highlighting.
visNetwork(nodes_df1, edges_df1) %>%
  visIgraphLayout(layout = "layout_with_fr") %>%
  visEdges(arrows = "to",
           smooth = list(enabled = TRUE, type = "curvedCW"),
           color = list(highlight = "lightgray")) %>%
  visOptions(highlightNearest = list(enabled = TRUE, degree = 1,
                                     hover = TRUE, labelOnly = TRUE),
             nodesIdSelection = TRUE,
             selectedBy = "group") %>%
  visLayout(randomSeed = 1234)
Show the code
# Extract companies whose tokens mention "fish" or "seafood" and
# visualise their network neighbourhood with community colouring.
df <- stopwords_removed

# Keep rows whose token matches either keyword.
df_extracted <- df %>%
  filter(str_detect(word, pattern = "fish") |
           str_detect(word, pattern = "seafood"))

# Bug fix: the argument is `.keep_all`, not `keep_all`. The original call
# created a literal column named `keep_all` (value TRUE) instead of
# retaining the other columns of the first row per id.
df_extracted_distinct <- df_extracted %>%
  distinct(id, .keep_all = TRUE)

# Network visualisation for fish and seafood related companies.

id3 <- MC3_edges %>%
  select(source) %>%
  rename(id = source)

id4 <- MC3_edges %>%
  select(id = target)

# Distinct node list with attributes; `by = "id"` makes the join key explicit.
MC3_nodes_extracted <- rbind(id3, id4) %>%
  distinct() %>%
  left_join(df_extracted_distinct, by = "id", unmatched = "drop")

# Keep only the top 1% of nodes by betweenness centrality.
MC3_graph_extracted <- tbl_graph(nodes = MC3_nodes_extracted,
                                 edges = MC3_edges,
                                 directed = FALSE) %>%
  mutate(betweenness_centrality = centrality_betweenness()) %>%
  filter(betweenness_centrality >= quantile(betweenness_centrality, 0.99))

# Create the visNetwork data frames.
edges_df1 <- MC3_graph_extracted %>%
  activate(edges) %>%
  as_tibble()            # as.tibble() is deprecated; use as_tibble()

nodes_df1 <- MC3_graph_extracted %>%
  activate(nodes) %>%
  as_tibble() %>%
  rename(label = id) %>%
  mutate(id = row_number()) %>%
  select(id, label)

# Community detection with the Louvain algorithm; the graph was built
# with directed = FALSE, so no conversion is needed.
communities <- cluster_louvain(MC3_graph_extracted)

# Attach each node's community as the visNetwork `group`.
membership <- membership(communities)
nodes_df1$group <- membership

# Interactive plot; the graphopt layout is used for this subgraph.
visNetwork(nodes_df1, edges_df1) %>%
  visIgraphLayout(layout = "layout_with_graphopt") %>%
  visEdges(arrows = "to",
           smooth = list(enabled = TRUE, type = "curvedCW"),
           color = list(highlight = "lightgray")) %>%
  visOptions(highlightNearest = list(enabled = TRUE, degree = 1,
                                     hover = TRUE, labelOnly = TRUE),
             nodesIdSelection = TRUE,
             selectedBy = "group") %>%
  visLayout(randomSeed = 1234)
Show the code
# Extract companies whose tokens mention "frozen" and visualise their
# network neighbourhood with community colouring.
df <- stopwords_removed

# Keep only rows whose token matches "frozen".
df_extracted <- df %>%
  filter(str_detect(word, pattern = "frozen"))

# Bug fix: the argument is `.keep_all`, not `keep_all`. The original call
# created a literal column named `keep_all` (value TRUE) instead of
# retaining the other columns of the first row per id.
df_extracted_distinct <- df_extracted %>%
  distinct(id, .keep_all = TRUE)

# Network visualisation for frozen-products related companies.

id3 <- MC3_edges %>%
  select(source) %>%
  rename(id = source)

id4 <- MC3_edges %>%
  select(id = target)

# Distinct node list with attributes; `by = "id"` makes the join key explicit.
MC3_nodes_extracted <- rbind(id3, id4) %>%
  distinct() %>%
  left_join(df_extracted_distinct, by = "id", unmatched = "drop")

# Keep only the top 1% of nodes by betweenness centrality.
MC3_graph_extracted <- tbl_graph(nodes = MC3_nodes_extracted,
                                 edges = MC3_edges,
                                 directed = FALSE) %>%
  mutate(betweenness_centrality = centrality_betweenness()) %>%
  filter(betweenness_centrality >= quantile(betweenness_centrality, 0.99))

# Create the visNetwork data frames.
edges_df1 <- MC3_graph_extracted %>%
  activate(edges) %>%
  as_tibble()            # as.tibble() is deprecated; use as_tibble()

nodes_df1 <- MC3_graph_extracted %>%
  activate(nodes) %>%
  as_tibble() %>%
  rename(label = id) %>%
  mutate(id = row_number()) %>%
  select(id, label)

# Community detection with the Louvain algorithm; the graph was built
# with directed = FALSE, so no conversion is needed.
communities <- cluster_louvain(MC3_graph_extracted)

# Attach each node's community as the visNetwork `group`.
membership <- membership(communities)
nodes_df1$group <- membership

# Interactive plot with community selection and neighbour highlighting.
visNetwork(nodes_df1, edges_df1) %>%
  visIgraphLayout(layout = "layout_with_fr") %>%
  visEdges(arrows = "to",
           smooth = list(enabled = TRUE, type = "curvedCW"),
           color = list(highlight = "lightgray")) %>%
  visOptions(highlightNearest = list(enabled = TRUE, degree = 1,
                                     hover = TRUE, labelOnly = TRUE),
             nodesIdSelection = TRUE,
             selectedBy = "group") %>%
  visLayout(randomSeed = 1234)
Show the code
# Static igraph plot of fish/seafood related companies, with node size
# proportional to normalised betweenness centrality.
df <- stopwords_removed

# Keep rows whose token matches either keyword.
df_extracted <- df %>%
  filter(str_detect(word, pattern = "fish") |
           str_detect(word, pattern = "seafood"))

# Bug fix: the argument is `.keep_all`, not `keep_all`. The original call
# created a literal column named `keep_all` (value TRUE) instead of
# retaining the other columns of the first row per id.
df_extracted_distinct <- df_extracted %>%
  distinct(id, .keep_all = TRUE)

id3 <- MC3_edges %>%
  select(source) %>%
  rename(id = source)

id4 <- MC3_edges %>%
  select(id = target)

# Distinct node list with attributes; `by = "id"` makes the join key explicit.
MC3_nodes_extracted <- rbind(id3, id4) %>%
  distinct() %>%
  left_join(df_extracted_distinct, by = "id", unmatched = "drop")

MC3_graph_extracted <- tbl_graph(nodes = MC3_nodes_extracted,
                                 edges = MC3_edges,
                                 directed = FALSE)

# Betweenness on the undirected graph; spell out FALSE instead of `F`
# (F is reassignable and a known R footgun).
V(MC3_graph_extracted)$betweenness <- betweenness(MC3_graph_extracted,
                                                  directed = FALSE)

# Node size normalised to [0, 50] by the maximum betweenness score,
# so only the most central companies are visually prominent.
plot(MC3_graph_extracted,
     vertex.label.cex = 0.6,
     vertex.label.color = "black",
     vertex.size = V(MC3_graph_extracted)$betweenness /
       max(V(MC3_graph_extracted)$betweenness) * 50)